# Import the library
library(readr)
library(naniar)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble 3.1.8 ✔ stringr 1.4.1
## ✔ tidyr 1.2.1 ✔ forcats 0.5.2
## ✔ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::as.difftime() masks base::as.difftime()
## ✖ lubridate::date() masks base::date()
## ✖ plotly::filter() masks dplyr::filter(), stats::filter()
## ✖ lubridate::intersect() masks base::intersect()
## ✖ dplyr::lag() masks stats::lag()
## ✖ lubridate::setdiff() masks base::setdiff()
## ✖ lubridate::union() masks base::union()
# Load the theses dataset; the column-type message is silenced.
df <- read_csv(file = "Datasets/theses_v2.csv", show_col_types = FALSE)
head(df)
# Visualise which cells are missing across the whole data frame
# (the large-data warning is switched off explicitly).
vis_miss(x = df, warn_large_data = FALSE)
## Warning: `gather_()` was deprecated in tidyr 1.2.0.
## ℹ Please use `gather()` instead.
## ℹ The deprecated feature was likely used in the visdat package.
## Please report the issue at <https://github.com/ropensci/visdat/issues>.
# Simulate a page count for 80% of the theses (rounded down) and leave the
# remainder as NA, then attach the shuffled result to df as `n.pages`.
#
# The seed is set before ANY random draw so that both the rnorm() values and
# the sample() permutation are reproducible from run to run.
set.seed(100)
n_observed <- as.integer(0.8 * nrow(df))
# seq_len() yields an empty vector when n_observed is 0 (seq(1, 0) would not).
x <- seq_len(n_observed)
# Pass the count explicitly instead of relying on rnorm() using length(x).
y <- rnorm(n_observed, mean = 200, sd = 50)
missing <- nrow(df) - n_observed
# Typed NA keeps the combined vector integer even when nothing is observed.
na_col <- rep(NA_integer_, missing)
# Shuffle so the missing values are spread at random over the rows.
n.pages <- sample(c(as.integer(y), na_col))
df$n.pages <- n.pages
head(df$n.pages, 10)
## [1] 145 156 201 189 203 167 NA 209 NA 190
## How the proportion of defences held on the first of January evolves over the years
# Inspect the raw "Date de soutenance" column (character strings, with NAs).
str(df[["Date de soutenance"]])
## chr [1:447644] NA NA "01-01-93" NA NA "24-11-08" "01-07-05" "08-12-09" ...
dt <- df[["Date de soutenance"]]
head(dt, n = 10)
## [1] NA NA "01-01-93" NA NA "24-11-08"
## [7] "01-07-05" "08-12-09" "10-01-13" "24-06-11"
# Parse the day-month-year strings into Date objects. With "%y", R maps the
# two-digit years 69-99 to 19xx and 00-68 to 20xx.
dt <- as.Date(dt, format = "%d-%m-%y")
# One-column data frame of the dates, keeping only rows with a date.
df_date <- na.omit(data.frame(dt))
head(df_date)
# Break each date into separate year, month and day columns.
df_date <- dplyr::mutate(
  df_date,
  year = lubridate::year(dt),
  month = lubridate::month(dt),
  day = lubridate::day(dt)
)
head(df_date)
# Keep only the defences dated 1 January (month and day both equal to 1).
newyear <- df_date %>% dplyr::filter(month == 1, day == 1)
head(newyear)
# Number of 1 January defences per year.
df_newyear <- newyear %>% group_by(year) %>% count()
head(df_newyear)
# Total number of defences per year.
df_date_all <- df_date %>% group_by(year) %>% count()
head(df_date_all)
# Attach the 1 January counts (n.x) to the yearly totals (n.y).
# The join starts from the totals so that every year is kept: a year with no
# 1 January defence gets a count of 0 (proportion 0) instead of being dropped,
# which is what an inner join would do.
df_proportion <- df_date_all %>%
  dplyr::rename(n.y = n) %>%
  dplyr::left_join(dplyr::rename(df_newyear, n.x = n), by = "year") %>%
  dplyr::mutate(n.x = dplyr::coalesce(n.x, 0L)) %>%
  dplyr::select(year, n.x, n.y)
# Share of each year's defences that are dated 1 January.
df_proportion$proportion <- df_proportion$n.x / df_proportion$n.y
head(df_proportion)
# Line chart of the yearly share of defences dated 1 January.
ggplot(data = df_proportion, mapping = aes(x = year, y = proportion)) +
  geom_line() +
  ggtitle("Proportion of PhD defended on the first of January over the years")
# Print the rows for 2006 through 2014.
dplyr::filter(df_proportion, year > 2005, year < 2015)
# The proportion of defences held on the first of January started decreasing in 2006.
##Cecile Martin problems
# Keep every thesis whose Auteur value is exactly "Cecile Martin".
# NOTE(review): the match is on the unaccented spelling; confirm whether
# "Cécile Martin" also occurs in the data.
Cecile <- df %>% dplyr::filter(Auteur == "Cecile Martin")
Cecile
# There are 4 different people named Cecile Martin; one of them has 4 theses.
# Supervisor's ID
# Collect the non-missing supervisor identifiers and record how many
# characters each one has.
supervisor_id <- df[["Identifiant directeur"]]
df_sup <- na.omit(data.frame(supervisor_id))
df_sup[["length"]] <- nchar(df_sup[["supervisor_id"]])
head(df_sup)
# Number of identifiers observed for each character length.
df_sup2 <- df_sup %>% group_by(length) %>% count()
head(df_sup2)
head(df_date_all)
# Line chart of the number of defences per year.
ggplot(data = df_date_all, mapping = aes(x = year, y = n)) +
  geom_line() +
  ggtitle("The number of PhD defended over the years")
# There is a sudden drop in the number of PhD defended in 2019 and 2020.
# Print the yearly counts for 2016 through 2020.
dplyr::filter(df_date_all, year > 2015, year < 2021)
# Take the defence date and thesis language columns from df.
languages_date <- df %>%
  dplyr::select(`Date de soutenance`, `Langue de la these`)
head(languages_date)
# Parse the day-month-year strings into Date objects.
languages_date[["Date de soutenance"]] <- as.Date(
  languages_date[["Date de soutenance"]],
  format = "%d-%m-%y"
)
head(languages_date)
# Build df_languages_date: rows with a missing date or language are dropped
# and the two columns are renamed to Date and Language.
df_languages_date <- data.frame(na.omit(data.frame(languages_date)))
names(df_languages_date) <- c("Date", "Language")
head(df_languages_date)
# Normalise the language codes to lower case.
df_languages_date[["Language"]] <- tolower(df_languages_date[["Language"]])
head(df_languages_date)
# Classify each language code into a Type: "en" is English, "fr" is French,
# "enfr"/"fren" is Bilingual, and any other code is Other.
df_languages_date <- dplyr::mutate(
  df_languages_date,
  Type = case_when(
    Language %in% c("enfr", "fren") ~ "Bilingual",
    Language == "fr" ~ "French",
    Language == "en" ~ "English",
    TRUE ~ "Other"
  )
)
head(df_languages_date)
unique(df_languages_date$Type)
## [1] "French" "English" "Other" "Bilingual"
# Number of theses for each Type.
df_lang_type <- dplyr::count(dplyr::group_by(df_languages_date, Type))
head(df_lang_type)
# Extract the calendar year of each defence into a new Year column.
df_languages_date$Year <- lubridate::year(df_languages_date$Date)
# Sort the rows by Year, ascending.
year_order <- order(df_languages_date[["Year"]])
df_languages_date <- df_languages_date[year_order, ]
head(df_languages_date)
# List the distinct years present.
unique(df_languages_date$Year)
## [1] 1971 1972 1973 1976 1979 1980 1982 1984 1985 1986 1987 1988 1989 1990 1991
## [16] 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006
## [31] 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020
# Number of theses for each (Year, Type) pair, stored in column Sum.
df_la_type_year <- df_languages_date %>%
  group_by(Year, Type) %>%
  count(name = "Sum")
head(df_la_type_year)
# Number of theses for each Year, stored in column Sum_Year.
df_year <- df_languages_date %>%
  group_by(Year) %>%
  count(name = "Sum_Year")
head(df_year)
# Put the yearly total next to every (Year, Type) count.
full_lang_type <- full_join(df_la_type_year, df_year, by = "Year")
head(full_lang_type)
# Each Type's share of its year's theses, as a percentage with 2 decimals.
full_lang_type <- full_lang_type %>%
  dplyr::mutate(Sum_Percentage = round((Sum / Sum_Year) * 100, 2))
head(full_lang_type)
# Area chart of each language Type's yearly percentage.
ggplot(
  data = full_lang_type,
  mapping = aes(x = Year, y = Sum_Percentage, fill = Type)
) +
  geom_area(alpha = 0.7, size = 1, colour = "white") +
  ggtitle("The choice of the language of the manuscript evolved over the past decades")
# Interactive version restricted to the years 1994 through 2019.
df_plotly <- full_lang_type %>% filter(Year >= 1994, Year < 2020)
plot_ly(
  type = 'scatter',
  x = df_plotly$Year,
  y = df_plotly$Sum_Percentage,
  color = df_plotly$Type,
  mode = 'lines',
  fill = 'tonexty'
)
# Count theses per defending institution, largest first, and keep the top 10.
University <- df %>%
  group_by(`Etablissement de soutenance`) %>%
  summarise(n = n()) %>%
  arrange(desc(n))
uni <- head(University, n = 10)
uni
# forcats provides fct_reorder(), used below to order the bars.
library(forcats)
# Horizontal bar chart of the top 10 institutions.
# The institution is reordered by its thesis count so the bars appear sorted,
# and aes() refers to the columns by bare name rather than through `uni$`,
# which ggplot2 warns against.
uni %>%
  ggplot(aes(x = fct_reorder(`Etablissement de soutenance`, n), y = n)) +
  geom_col(fill = "#f68060", alpha = .6, width = .4) +
  coord_flip() +
  xlab("University") +
  ylab("Total Theses") +
  ggtitle("Top 10 universities have the most theses from 1971 to 2020") +
  theme_bw()
# Count theses by their "Accessible en ligne" value, most frequent first.
Public <- df %>%
  group_by(`Accessible en ligne`) %>%
  summarise(n = n()) %>%
  arrange(desc(n))
Public
# NOTE(review): `labels` is not passed to pie() below (the slice labels come
# from the data itself); confirm whether this vector is still needed.
labels <- c("no", "yes")
# Pie chart of the counts, labelled with the raw column values.
pie(
  Public$n,
  labels = Public$`Accessible en ligne`,
  main = "The proportion of theses accessible online"
)